import pandas as pd
import os
import dotenv
import plotly.graph_objects as go
import matplotlib
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import umap
from mybooks.library import Library
# Load environment variables through python-dotenv (by default from a local
# `.env` file) so that the `os.getenv` lookup below can find the API key.
dotenv.load_dotenv()
True
# Open the book collection stored in the local CSV file. The API key is read
# from the `API_GET` environment variable (None when the variable is unset).
library = Library(
    file="./library.csv",
    api_key=os.environ.get('API_GET'),
)
# Preview the first ten book descriptions.
library.library['description'].head(10)
0 Le jour de ses onze ans, Harry Potter, un orph... 1 Cette année, Harry a dix-sept ans et ne retour... 2 Sirius Black, le dangereux criminel qui s’est ... 3 À quinze ans, Harry entre en cinquième année à... 4 Dans un monde de plus en plus inquiétant, Harr... 5 Une rentrée fracassante en voiture volante, un... 6 Harry Potter a quatorze ans et entre en quatri... 7 Dans un monde de plus en plus inquiétant, Harr... 8 La vie de Camille bascule quand elle pénètre d... 9 À Gwendalavir, Ewilan se prépare à partir pour... Name: description, dtype: object
# Density estimate of the description lengths (`len` of each description),
# with the x axis restricted to the 0-3000 range.
(
    library.library['description']
    .map(len)
    .plot.kde(xlim=[0, 3000])
)
<AxesSubplot:ylabel='Density'>
# Horizontal bar chart of the 30 most frequent first-listed authors.
# Only element 0 of each `authors` entry is counted.
(
    library.library['authors']
    .map(lambda names: names[0])
    .value_counts()
    .head(30)
    .plot.barh(title="# Books per Author (in french)", figsize=(10, 15))
)
<AxesSubplot:title={'center':'# Books per Author (in french)'}>
# TF-IDF featurizer for the book descriptions.
vectorizer = TfidfVectorizer(
    # Text normalization: lowercase everything and fold accented characters
    # to their ASCII equivalents.
    lowercase=True,
    strip_accents="ascii",
    # Tokenization: word-level unigrams, bigrams and trigrams, with no
    # stop-word list applied.
    analyzer="word",
    ngram_range=(1, 3),
    stop_words=None,
    # Vocabulary pruning: drop terms present in more than 80% of documents,
    # apply no lower document-frequency cutoff, and keep at most 10000 terms.
    max_df=0.8,
    min_df=0.0,
    max_features=10000,
)
# Fit the TF-IDF vocabulary on the descriptions and vectorize them in one step.
# `%time` is an IPython magic that reports how long the statement took.
%time vectors = vectorizer.fit_transform(library.library['description'])
# Shape of the document-term matrix: (number of descriptions, number of terms).
vectors.shape
CPU times: user 951 ms, sys: 45 ms, total: 996 ms Wall time: 1.01 s
(1287, 10000)
# UMAP projection of the TF-IDF vectors down to two dimensions for plotting.
reducer = umap.UMAP(
    n_components=2,       # 2-D output so it can be drawn as a scatter plot
    n_neighbors=15,       # size of the local neighborhood UMAP considers
    metric="euclidean",   # distance used between the input vectors
    random_state=42,      # fixed seed so the layout is reproducible
    verbose=False,
)
# Fit UMAP on the TF-IDF matrix and project every description to 2-D.
# `%time` is an IPython magic that reports how long the statement took.
%time embedding = reducer.fit_transform(vectors)
# Shape of the projection: (number of descriptions, 2).
embedding.shape
CPU times: user 9.75 s, sys: 182 ms, total: 9.93 s Wall time: 9.47 s
(1287, 2)
# Static overview of the 2-D embedding: one red cross per book, annotated
# with the book title.
plt.figure(figsize=(10, 10))

# Draw all markers with a single vectorized call instead of issuing one
# `scatter` call (and creating one artist) per book.
plt.scatter(embedding[:, 0], embedding[:, 1], c="r", marker="x")

# Row i of `embedding` was computed from row i of the descriptions column, so
# titles are paired with points by position. A label-based lookup
# (`.at[i, "title"]`) only lines up when the frame has a default 0..n-1 index,
# which is what the previous bare `except: pass` was silently papering over.
titles = library.library["title"].to_numpy()
for (x, y), title in zip(embedding, titles):
    # Best effort: a book without a title keeps its marker but gets no label.
    if title is None or (isinstance(title, float) and pd.isna(title)):
        continue
    plt.text(x + .03, y + .03, title, fontsize=9)
# Table backing the interactive plot: a copy of the library extended with the
# 2-D coordinates, the first-listed author and one hex color per author.
data = library.library.copy()
data["x"] = embedding[:, 0].tolist()
data["y"] = embedding[:, 1].tolist()

# Give a color per author (keyed on the first entry of each `authors` value).
data["author"] = data["authors"].apply(lambda names: names[0])
authors_names = data["author"].unique().tolist()

# `plt.get_cmap` is used instead of `plt.cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9. The "hsv" map is cyclic (it starts and
# ends on red), so it is sampled with one extra entry that is left unused;
# otherwise the first and last author would get nearly the same color.
colors = plt.get_cmap("hsv", len(authors_names) + 1)
author_to_color = {
    name: matplotlib.colors.to_hex(colors(i))
    for i, name in enumerate(authors_names)
}
data["color"] = data["author"].map(author_to_color)
# Interactive view of the embedding: one marker per book, colored by author.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=data['x'],
        y=data['y'],
        mode='markers',
        marker=dict(color=data['color']),
        text=data['title'],  # book title shown as hover text
    )
)
fig.update_layout(title='Books Space')
fig.show()